from active_learning_dd.models.supervised import *
from active_learning_dd.utils.data_utils import *
import pandas as pd
import numpy as np
import glob
import warnings
warnings.simplefilter("ignore")
%load_ext autoreload
%autoreload 2
root_dir = '../../../aldd_results/aldd_exp_3_final//params_results\\'
def run_rf(task_col, hit_perc, hit_limit, uncert_method='least_confidence', normalize_uncert=False):
    """Train an RF on the first N active-learning iterations and plot KDEs.

    For iter_max in {1, 10, 20, 30, 40, 50}: fit a random forest on the
    compounds selected up to that iteration, score the remaining unlabeled
    pool, and plot kernel density estimates of the predictions (left panel)
    and the uncertainty scores (right panel), split by true hits vs non-hits.

    Parameters
    ----------
    task_col : str
        PCBA task/label column name; also selects the results directory.
    hit_perc, hit_limit :
        Unused here; kept so all run_* helpers share one call signature.
    uncert_method : str
        Uncertainty method name forwarded to model.get_uncertainty.
    normalize_uncert : bool
        If True, replace the model uncertainty with 1 - |2*x - 1| where x is
        the min-max-normalized prediction (distance from 0.5 after rescaling).
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    af = root_dir + 'sampled_hyparams/ClusterBasedWCSelector_609/{}/0/batch_size_96/'.format(task_col)
    for iter_max in [1, 10, 20, 30, 40, 50]:
        # Training set = union of all batches selected up to iter_max.
        train_files = [af + '/training_data/iter_{}.csv'.format(i) for i in range(iter_max)]
        train_df = pd.concat([pd.read_csv(x) for x in train_files])
        # Unlabeled pool = full task dataset minus already-trained compounds.
        unlabeled_df = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/pcba/{}_cv_96/*.csv'.format(task_col))])
        unlabeled_df = unlabeled_df.drop_duplicates('Index ID')
        unlabeled_df = unlabeled_df[~unlabeled_df['Index ID'].isin(train_df['Index ID'].values)]
        # Decode '0'/'1' fingerprint strings into uint16 bit vectors.
        # np.fromstring on str is deprecated (removed in newer numpy);
        # np.frombuffer over the ASCII bytes is the supported equivalent.
        X_train = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                             for x in train_df['Morgan FP_2_1024']])
        X_test = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                            for x in unlabeled_df['Morgan FP_2_1024']])
        y_train = train_df[task_col].values.reshape(-1, 1)
        y_test = unlabeled_df[task_col].values.reshape(-1, 1)
        model = sklearn_randomforest.SklearnRF(task_names=[task_col],
                                               n_estimators=50,
                                               max_features="log2",
                                               min_samples_leaf=1,
                                               n_jobs=1,
                                               class_weight="balanced",
                                               random_state=20183112,
                                               oob_score=False,
                                               verbose=0)
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)
        test_uncert = model.get_uncertainty(X_test, uncert_method, [])
        if normalize_uncert:
            # Min-max rescale predictions to [0, 1], then take distance from
            # 0.5 so the uncertainty ordering can differ from the activity one.
            x = (test_preds - np.min(test_preds)) / (np.max(test_preds) - np.min(test_preds))
            test_uncert = 1 - (np.abs(2 * x - 1))
        test_actives = np.where(y_test[:, 0] == 1)[0]
        test_inactives = np.where(y_test[:, 0] == 0)[0]
        # Overlap between the top-k most active and top-k most uncertain compounds.
        topk = 96
        top_pred = test_preds[:, 0].argsort()[::-1][:topk]
        top_uncert = test_uncert[:, 0].argsort()[::-1][:topk]
        overlap = np.intersect1d(top_pred, top_uncert).shape[0]
        sns.set_context("paper")
        sns.set(font_scale=1.5)
        figsize = (22, 10)
        fig_title = ('Task {} RF predictions kernel density estimate after {} iteration\n'
                     'with {} training compounds and {} unlabeled pool compounds.').format(
                         task_col, iter_max, y_train.shape[0], y_test.shape[0])
        fig_title += ' Overlap top {}: {}.'.format(topk, overlap)
        fig, axes = plt.subplots(1, 2, figsize=figsize)
        # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11;
        # switch to histplot(kde=True)/displot when upgrading seaborn.
        sns.distplot(test_preds[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='darkgreen')
        sns.distplot(test_preds[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='coral')
        axes[0].legend(['hits', 'non-hits'])
        axes[0].set_title('RF predictions distribution')
        sns.distplot(test_uncert[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='darkgreen')
        sns.distplot(test_uncert[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='coral')
        axes[1].legend(['hits', 'non-hits'])
        axes[1].set_title('RF uncertainty (distance from 0.5) distribution')
        fig.suptitle(fig_title)
        fig.tight_layout()
        fig.subplots_adjust(top=.85)
        plt.show()
# Least-confidence runs across three tasks spanning low -> high hit counts.
for task_col, hit_perc, hit_limit in [('pcba-aid588456', 0.01, 51),
                                      ('pcba-aid602310', 0.08, 310),
                                      ('pcba-aid1458', 2.97, 5778)]:
    run_rf(task_col, hit_perc, hit_limit)
At iteration 1, the range of the random forest predictions is small ([0, 0.4]), which means that the activity score AND the uncertainty score (distance from 0.5) give the same ORDERING. From iteration 10 onward, the range of scores depends on the hit % of the task: tasks with fewer hits have a smaller prediction range than tasks with more hits. This is likely because the model has not seen enough hit examples to make a confident probability estimate.
The task pcba-aid1458 has 5778 hits, which is one of the highest hit counts in 128-PCBA. For this task, the right tail of the uncertainty plots is heavier for hits (green) than for non-hits (orange), so more hits will be selected during exploration. This trend is still visible at 50 iterations.
# Same three tasks, with min-max-normalized uncertainty.
for task_col, hit_perc, hit_limit in [('pcba-aid588456', 0.01, 51),
                                      ('pcba-aid602310', 0.08, 310),
                                      ('pcba-aid1458', 2.97, 5778)]:
    run_rf(task_col, hit_perc, hit_limit, normalize_uncert=True)
# Same three tasks, with query-by-committee uncertainty (no normalization).
for task_col, hit_perc, hit_limit in [('pcba-aid588456', 0.01, 51),
                                      ('pcba-aid602310', 0.08, 310),
                                      ('pcba-aid1458', 2.97, 5778)]:
    run_rf(task_col, hit_perc, hit_limit, uncert_method='query_by_committee', normalize_uncert=False)
To remedy this, we can normalize the predictions of the tested pool to the range 0-1. This would show that the uncertainty score (distance from 0.5) is clearly different from the activity score.
In summary: in early iterations, exploration-heavy and exploitation-heavy strategies are likely to select similar compounds, since the activity and uncertainty scores give similar orderings; in later iterations the range of scores widens, so the two orderings diverge.
def run_nn(task_col, hit_perc, hit_limit, uncert_method='least_confidence', normalize_uncert=False):
    """Train a simple NN on the first N active-learning iterations and plot KDEs.

    NN counterpart of run_rf: for iter_max in {1, 10, 20, 30, 40, 50}, fit a
    SimpleNN on the compounds selected up to that iteration, score the
    remaining unlabeled pool, and plot KDEs of predictions (left) and
    uncertainty (right), split by true hits vs non-hits.

    Parameters
    ----------
    task_col : str
        PCBA task/label column name; also selects the results directory.
    hit_perc, hit_limit :
        Unused here; kept so all run_* helpers share one call signature.
    uncert_method : str
        Uncertainty method name forwarded to model.get_uncertainty.
    normalize_uncert : bool
        If True, replace the model uncertainty with 1 - |2*x - 1| where x is
        the min-max-normalized prediction.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    af = root_dir + 'sampled_hyparams/ClusterBasedWCSelector_609/{}/0/batch_size_96/'.format(task_col)
    for iter_max in [1, 10, 20, 30, 40, 50]:
        # Training set = union of all batches selected up to iter_max.
        train_files = [af + '/training_data/iter_{}.csv'.format(i) for i in range(iter_max)]
        train_df = pd.concat([pd.read_csv(x) for x in train_files])
        # Unlabeled pool = full task dataset minus already-trained compounds.
        unlabeled_df = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/pcba/{}_cv_96/*.csv'.format(task_col))])
        unlabeled_df = unlabeled_df.drop_duplicates('Index ID')
        unlabeled_df = unlabeled_df[~unlabeled_df['Index ID'].isin(train_df['Index ID'].values)]
        # np.fromstring on str is deprecated; frombuffer over ASCII bytes
        # is the supported equivalent for decoding '0'/'1' fingerprints.
        X_train = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                             for x in train_df['Morgan FP_2_1024']])
        X_test = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                            for x in unlabeled_df['Morgan FP_2_1024']])
        y_train = train_df[task_col].values.reshape(-1, 1)
        y_test = unlabeled_df[task_col].values.reshape(-1, 1)
        model = neural_network.SimpleNN(task_names=[task_col],
                                        n_features=1024,
                                        batch_size=2056,
                                        epochs=200,
                                        verbose=0)
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)
        test_uncert = model.get_uncertainty(X_test, uncert_method, [])
        if normalize_uncert:
            # Distance from 0.5 after min-max rescaling predictions to [0, 1].
            x = (test_preds - np.min(test_preds)) / (np.max(test_preds) - np.min(test_preds))
            test_uncert = 1 - (np.abs(2 * x - 1))
        test_actives = np.where(y_test[:, 0] == 1)[0]
        test_inactives = np.where(y_test[:, 0] == 0)[0]
        # Overlap between the top-k most active and top-k most uncertain compounds.
        topk = 96
        top_pred = test_preds[:, 0].argsort()[::-1][:topk]
        top_uncert = test_uncert[:, 0].argsort()[::-1][:topk]
        overlap = np.intersect1d(top_pred, top_uncert).shape[0]
        sns.set_context("paper")
        sns.set(font_scale=1.5)
        figsize = (22, 10)
        # Fixed copy-paste labels: this is the NN run, the original titles said "RF".
        fig_title = ('Task {} NN predictions kernel density estimate after {} iteration\n'
                     'with {} training compounds and {} unlabeled pool compounds.').format(
                         task_col, iter_max, y_train.shape[0], y_test.shape[0])
        fig_title += ' Overlap top {}: {}.'.format(topk, overlap)
        fig, axes = plt.subplots(1, 2, figsize=figsize)
        sns.distplot(test_preds[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='darkgreen')
        sns.distplot(test_preds[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='coral')
        axes[0].legend(['hits', 'non-hits'])
        axes[0].set_title('NN predictions distribution')
        sns.distplot(test_uncert[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='darkgreen')
        sns.distplot(test_uncert[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='coral')
        axes[1].legend(['hits', 'non-hits'])
        axes[1].set_title('NN uncertainty (distance from 0.5) distribution')
        fig.suptitle(fig_title)
        fig.tight_layout()
        fig.subplots_adjust(top=.85)
        plt.show()
# NN least-confidence runs on the same three tasks.
for task_col, hit_perc, hit_limit in [('pcba-aid588456', 0.01, 51),
                                      ('pcba-aid602310', 0.08, 310),
                                      ('pcba-aid1458', 2.97, 5778)]:
    run_nn(task_col, hit_perc, hit_limit)
# --- SVM sanity check: use the signed decision-function distance as the
# activity score and |distance| as (inverse) uncertainty. ---
from sklearn.svm import SVC

task_col = 'pcba-aid602310'
for iter_max in [1, 10, 20, 30, 40, 50]:
    import seaborn as sns
    import matplotlib.pyplot as plt
    af = root_dir + 'sampled_hyparams/ClusterBasedWCSelector_609/{}/0/batch_size_96/'.format(task_col)
    # Training set = union of all batches selected up to iter_max.
    train_files = [af + '/training_data/iter_{}.csv'.format(i) for i in range(iter_max)]
    train_df = pd.concat([pd.read_csv(x) for x in train_files])
    # Unlabeled pool = full task dataset minus already-trained compounds.
    unlabeled_df = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/pcba/{}_cv_96/*.csv'.format(task_col))])
    unlabeled_df = unlabeled_df.drop_duplicates('Index ID')
    unlabeled_df = unlabeled_df[~unlabeled_df['Index ID'].isin(train_df['Index ID'].values)]
    # np.fromstring on str is deprecated; frombuffer over ASCII bytes is the
    # supported equivalent for decoding '0'/'1' fingerprint strings.
    X_train = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                         for x in train_df['Morgan FP_2_1024']])
    X_test = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                        for x in unlabeled_df['Morgan FP_2_1024']])
    y_train = train_df[task_col].values.reshape(-1, 1)
    y_test = unlabeled_df[task_col].values.reshape(-1, 1)
    model = SVC(class_weight='balanced')
    # SVC expects a 1-D label array; ravel() avoids sklearn's
    # DataConversionWarning for the (n, 1) column vector.
    model.fit(X_train, y_train.ravel())
    train_svm_dist = model.decision_function(X_train)
    test_svm_dist = model.decision_function(X_test)
    test_preds = test_svm_dist            # signed distance: larger => more confidently active
    test_uncert = np.abs(test_svm_dist)   # |distance| to the hyperplane: smaller => more uncertain
    test_actives = np.where(y_test[:, 0] == 1)[0]
    test_inactives = np.where(y_test[:, 0] == 0)[0]
    topk = 96
    top_pred = test_preds.argsort()[::-1][:topk]   # max distance is most certain/active
    top_uncert = test_uncert.argsort()[:topk]      # min |distance| is most uncertain
    overlap = np.intersect1d(top_pred, top_uncert).shape[0]
    sns.set_context("paper")
    sns.set(font_scale=1.5)
    figsize = (22, 10)
    # Fixed copy-paste labels: this cell plots SVM scores, the original titles said "RF".
    fig_title = ('Task {} SVM predictions kernel density estimate after {} iteration\n'
                 'with {} training compounds and {} unlabeled pool compounds.').format(
                     task_col, iter_max, y_train.shape[0], y_test.shape[0])
    fig_title += ' Overlap top {}: {}.'.format(topk, overlap)
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    sns.distplot(test_preds[test_actives], kde_kws={"linewidth": 2}, ax=axes[0], color='darkgreen')
    sns.distplot(test_preds[test_inactives], kde_kws={"linewidth": 2}, ax=axes[0], color='coral')
    axes[0].legend(['hits', 'non-hits'])
    axes[0].set_title('SVM decision-function distribution')
    sns.distplot(test_uncert[test_actives], kde_kws={"linewidth": 2}, ax=axes[1], color='darkgreen')
    sns.distplot(test_uncert[test_inactives], kde_kws={"linewidth": 2}, ax=axes[1], color='coral')
    axes[1].legend(['hits', 'non-hits'])
    axes[1].set_title('SVM uncertainty (|decision distance|) distribution')
    fig.suptitle(fig_title)
    fig.tight_layout()
    fig.subplots_adjust(top=.85)
    plt.show()
# --- RF with density-weighted uncertainty (single run at iteration 1). ---
task_col = 'pcba-aid602310'
iter_max = 1
af = root_dir + 'sampled_hyparams/ClusterBasedWCSelector_609/{}/0/batch_size_96/'.format(task_col)
# Training set = union of all batches selected up to iter_max.
train_files = [af + '/training_data/iter_{}.csv'.format(i) for i in range(iter_max)]
train_df = pd.concat([pd.read_csv(x) for x in train_files])
# Unlabeled pool = full task dataset minus already-trained compounds.
unlabeled_df = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/pcba/{}_cv_96/*.csv'.format(task_col))])
unlabeled_df = unlabeled_df.drop_duplicates('Index ID')
unlabeled_df = unlabeled_df[~unlabeled_df['Index ID'].isin(train_df['Index ID'].values)]
# np.fromstring on str is deprecated; frombuffer over ASCII bytes is the
# supported equivalent for decoding '0'/'1' fingerprint strings.
X_train = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                     for x in train_df['Morgan FP_2_1024']])
X_test = np.vstack([(np.frombuffer(x.encode('ascii'), dtype=np.uint8) - ord('0')).astype(np.uint16)
                    for x in unlabeled_df['Morgan FP_2_1024']])
y_train = train_df[task_col].values.reshape(-1, 1)
y_test = unlabeled_df[task_col].values.reshape(-1, 1)
model = sklearn_randomforest.SklearnRF(task_names=[task_col],
                                       n_estimators=50,
                                       max_features="log2",
                                       min_samples_leaf=1,
                                       n_jobs=1,
                                       class_weight="balanced",
                                       random_state=20183112,
                                       oob_score=False,
                                       verbose=0)
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
# get_uncertainty's signature everywhere else in this notebook is
# (X, method_name, params). The original call here passed the distance
# function in the method slot; corrected to the (X, 'density_weight',
# [dist_func, beta, ...]) form (as in the author's own follow-up re-run,
# now folded in here — the redundant post-plot recompute was removed).
test_uncert = model.get_uncertainty(X_test, 'density_weight',
                                    [feature_dist_func_dict()['tanimoto_dissimilarity'], 1.0, True])
test_actives = np.where(y_test[:, 0] == 1)[0]
test_inactives = np.where(y_test[:, 0] == 0)[0]
# Overlap between the top-k most active and top-k most uncertain compounds.
topk = 96
top_pred = test_preds[:, 0].argsort()[::-1][:topk]
top_uncert = test_uncert[:, 0].argsort()[::-1][:topk]
overlap = np.intersect1d(top_pred, top_uncert).shape[0]
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context("paper")
sns.set(font_scale=1.5)
figsize = (22, 10)
fig_title = ('Task {} RF predictions kernel density estimate after {} iteration\n'
             'with {} training compounds and {} unlabeled pool compounds.').format(
                 task_col, iter_max, y_train.shape[0], y_test.shape[0])
fig_title += ' Overlap top {}: {}.'.format(topk, overlap)
fig, axes = plt.subplots(1, 2, figsize=figsize)
sns.distplot(test_preds[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='darkgreen')
sns.distplot(test_preds[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[0], color='coral')
axes[0].legend(['hits', 'non-hits'])
axes[0].set_title('RF predictions distribution')
sns.distplot(test_uncert[test_actives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='darkgreen')
sns.distplot(test_uncert[test_inactives, 0], kde_kws={"linewidth": 2}, ax=axes[1], color='coral')
axes[1].legend(['hits', 'non-hits'])
axes[1].set_title('RF uncertainty (density-weighted) distribution')
fig.suptitle(fig_title)
fig.tight_layout()
fig.subplots_adjust(top=.85)
plt.show()